/*******************************************************************************
Project:		FZZ
Last modified: 	2021-11-16
Modified by:	Emily Bjorkman
Description:	This do-file downloads and cleans MEPS data.

*******************************************************************************/

* Start from a clean session: drop the data in memory and all defined programs
clear
program drop _all

* Directory globals; every path below is built from $localdir

global localdir  "/Users/eb8378/Dropbox/FZZ"
global datadir   "$localdir/data"
global statadir  "$localdir/stata"
global rawdir    "$statadir/raw"
global dodir     "$statadir/do"
global dtadir    "$statadir/dta"
global logdir    "$statadir/log"
global dumpdir   "$statadir/dump"
global repfolder "$localdir/emily workflow/meps_replication"

* Shared graph options: white plot and graph regions
global gpr "plotregion(fcolor(white) lcolor(white)) graphregion(fcolor(white) lcolor(white))"



/*******************************************************************************
	(1) Download raw MEPS files
	
		Note: only need to run once to download.
*******************************************************************************/

/* 	This program downloads national MEPS .xls pre-packaged data files on insurance 
	coverage and premia from the MEPS Insurance Component. Originally written by
	DS in 2020. Modified to include 2019. */
program load_meps_national_insurance
	/*	Downloads UnitedStates<year>.xls for each year 1996-2019 into
		$datadir/MEPS/prepared_tables/nationalIC.

		Side effects: creates the target folders when they do not exist and
		leaves the working directory set to the nationalIC folder. Files that
		already exist are not overwritten (copy is called without replace). */

	* Stata's built-in mkdir instead of shelling out. It does not create
	* parent folders, so each level is made in turn; capture because a
	* folder may already exist.
	capture mkdir "$datadir/MEPS"
	capture mkdir "$datadir/MEPS/prepared_tables"
	capture mkdir "$datadir/MEPS/prepared_tables/nationalIC"
	cd "$datadir/MEPS/prepared_tables/nationalIC"

	forvalues year = 1996/2019 {
		* Best effort: a failed copy must not stop the loop, but report it
		* so a missing year is visible in the log.
		capture copy "https://meps.ahrq.gov/mepsweb/data_stats/summ_tables/insr/excel/`year'/UnitedStates`year'.xls" ///
			"UnitedStates`year'.xls"
		if _rc {
			display as text "UnitedStates`year'.xls not copied (rc = " _rc ")"
		}
	}
end

/* 	This program downloads regional MEPS .xls pre-packaged data files on insurance 
	coverage and premia from the MEPS Insurance Component. Originally written by
	DS in 2020. Modified to include 2019. */
program load_meps_regional_insurance
	/*	Downloads <state><year>.xls for every state name found in
		$repfolder/st-est80_90.csv and every year 1996-2019 into
		$datadir/MEPS/prepared_tables/regionalIC.

		Side effects: replaces the data in memory (import ..., clear), creates
		the target folders when they do not exist, and leaves the working
		directory set to the regionalIC folder. Files that already exist are
		not overwritten (copy is called without replace). */

	* Load state names

	import delimited using "$repfolder/st-est80_90.csv", clear

	keep v1
	* Remove the first period and every space so names match the URL form
	replace v1 = subinstr(subinstr(v1, ".", "", 1), " ", "", .)
	drop if v1 == "UnitedStates"

	quietly levelsof v1, clean local(statelist)

	/* Change working directory to the folder we're going to copy to. Then,
		cycling through states and years, load in Excel tables from the
		Internet. Not every year of data is available for every state, and
		that's okay. */

	* Built-in mkdir does not create parent folders, so make each level in
	* turn; capture because a folder may already exist.
	capture mkdir "$datadir/MEPS"
	capture mkdir "$datadir/MEPS/prepared_tables"
	capture mkdir "$datadir/MEPS/prepared_tables/regionalIC"
	cd "$datadir/MEPS/prepared_tables/regionalIC"

	local ncopied = 0
	local nfailed = 0

	forvalues year = 1996/2019 {
		foreach state of local statelist {
			capture copy "https://meps.ahrq.gov/mepsweb/data_stats/summ_tables/insr/excel/`year'/`state'`year'.xls" ///
				"`state'`year'.xls"
			if _rc {
				local ++nfailed
			}
			else {
				local ++ncopied
			}
		}
	}

	* Failures are expected, so only a summary is printed
	display as text "regionalIC: `ncopied' files copied, `nfailed' not copied"
end

/*	This program downloads MEPS programming statements which convert
	raw MEPS files in ASCII format to Full Year Consolidated files
	in .dta format. Modified to include 2018 and 2019.
	Note: must manually edit these do files to update the directory and file save.
	Used in build_meps_yearspecific_microfile.do (not included in this file) */
program load_meps_programming_statements
	/*	Copies h<filenum>stu.txt for each file number listed below into
		$repfolder/fullyearconsolidated, saving it as h<filenum>stu.do and
		overwriting any existing copy. Leaves the working directory set to
		that folder. A failed copy stops the program (no capture). */

	cd "$repfolder/fullyearconsolidated"

	local filenums 12 20 28 38 50 60 70 79 89 97 105 113 121 129 138 147 ///
		155 163 171 181 192 201 209 216

	foreach filenum of local filenums {
		copy "https://meps.ahrq.gov/mepsweb/data_stats/download_data/pufs/h`filenum'/h`filenum'stu.txt" ///
			"h`filenum'stu.do", replace
	}
end





/*******************************************************************************
	(2) Build MEPS analysis data files
	
		Note: these are based on a do-file written by DS in June 2020.
		There are five other MEPS build files written at the same time,
			
			build_meps_yearspecific_microfile.do
			build_meps_cbsa_insurance.do
			build_meps_regional_insurance.do
			build_meps_expenditures_averages.do
			
		that are not included here.
*******************************************************************************/



/*******************************************************************************
	(2.1) Cycle through years and retrieve rows from tables that are interesting 
		to us
*******************************************************************************/

	/***************************************************************************
		(2.1.1) Load in all the raw Excel files and process them: select relevant
			rows and ensure that we're getting the right rows
	***************************************************************************/

* respectcase: on Windows the dir macro function otherwise returns lowercased
* file names, which makes name-based bookkeeping fragile across platforms.
local files : dir "$repfolder/nationalIC" files "*.xls", respectcase

* Years successfully loaded, in load order; used to find the tempfiles again
local loadedyears

foreach filename in `files' {

	* Row 1 of the sheet is skipped; the first imported row holds the headers
	import excel using "$repfolder/nationalIC/`filename'", ///
		cellrange(A2) clear

	* The four characters before ".xls" are taken as the year
	local year = substr("`filename'", -8, 4)
	confirm integer number `year'

	keep A-C
	foreach strvar of varlist * {
		qui replace `strvar' = trim(`strvar')
	}

	* Verify the layout before trusting the columns
	assert A[1] == "Table No." 
	rename A TableNum

	assert B[1] == "Table Description"
	rename B TableDesc

	assert C[1] == "Total"
	rename C v

	qui drop in 1

	* Normalise table numbers: upper case, no periods or parentheses
	qui replace TableNum = upper(subinstr(TableNum, ".", "", .))
	qui replace TableNum = upper(subinstr(TableNum, "(", "", .))
	qui replace TableNum = upper(subinstr(TableNum, ")", "", .))
	
	qui keep if inlist(TableNum, "IIB1", "IIB2", "IIB2A1", "IIB2B") | ///
			inlist(TableNum, "IIC1", "IIC2", "IIC4") | ///
			inlist(TableNum, "IID1", "IID2", "IID4") | ///
			inlist(TableNum, "IIE1", "IIE2", "IIE4")

	gen year = `year'

	order year

	tempfile UnitedStates`year'
	qui save `UnitedStates`year''

	local loadedyears `loadedyears' `year'
}

	/***************************************************************************
		(2.1.2) Append all the tempfiles
	***************************************************************************/

clear

* Look tempfiles up by the recorded year rather than by re-parsing file names
foreach year of local loadedyears {
	append using "`UnitedStates`year''"
}

/*******************************************************************************
	(2.2) Clean up appended tables
*******************************************************************************/

	/***************************************************************************
		(2.2.1) Ensure table descriptions are constant within table numbers;
			then make shorter table descriptions and get rid of table numbers
	***************************************************************************/

* Every row of a table number must carry the same description. With
* descriptions sorted within table number, first == last exactly when all
* rows agree, so one by-group assert replaces a helper variable per table.
bysort TableNum (TableDesc): assert TableDesc[1] == TableDesc[_N]

* Short names; these become variable names after the reshape
replace TableDesc = "NumEmployees" if TableNum == "IIB1"
replace TableDesc = "PctEmpAtEstOfferingHI" if TableNum == "IIB2"
replace TableDesc = "PctEshiTakeup" if TableNum == "IIB2A1"
replace TableDesc = "PctEnrlldAtEstOfferingHI" if TableNum == "IIB2B"

replace TableDesc = "AvgSinglePremPerEmp" if TableNum == "IIC1"
replace TableDesc = "AvgEmpContribPerEmpSingleCvg" if TableNum == "IIC2"
replace TableDesc = "PctEmpWithSingleCvg" if TableNum == "IIC4"

replace TableDesc = "AvgFamilyPremPerEmp" if TableNum == "IID1"
replace TableDesc = "AvgEmpContribPerEmpFamilyCvg" if TableNum == "IID2"
replace TableDesc = "PctEmpWithFamilyCvg" if TableNum == "IID4"

replace TableDesc = "AvgEPlus1PremPerEmp" if TableNum == "IIE1"
replace TableDesc = "AvgEmpContribPerEmpEPlus1Cvg" if TableNum == "IIE2"
replace TableDesc = "PctEmpWithEPlus1Cvg" if TableNum == "IIE4"

* No temporary variables are created above, so only TableNum needs dropping
drop TableNum

	/***************************************************************************
		(2.2.2) Reshape so employee number categories are long and table 
			descriptions are wide
	***************************************************************************/

* One row per year, one column per short table description
reshape wide v, i(year) j(TableDesc, string)
rename v* *

	/***************************************************************************
		(2.2.3) Code missing data numerically, then destring averages and 
			percents
	***************************************************************************/

foreach numvar of varlist Avg* Pct* Num* {
	* Strip thousands separators, percent signs and spaces
	replace `numvar' = subinstr(`numvar', ",", "", .)
	replace `numvar' = subinstr(`numvar', "%", "", .)
	replace `numvar' = subinstr(`numvar', " ", "", .) 

	* Anything non-numeric must be one of the known missing markers
	assert inlist(`numvar', "", "no data", "suppressed") if missing(real(`numvar'))
	replace `numvar' = "-1" if inlist(`numvar', "", "no data")
	replace `numvar' = "-3" if `numvar' == "suppressed" 
}

destring Avg* Pct* Num*, replace

* Adjust 2019 to be the same units as other years. The -1/-3 missing codes
* are left alone: dividing them by 100 would stop the later
* inlist(..., -1, -3) checks from recognising them.
foreach var in PctEmpAtEstOfferingHI PctEmpWithEPlus1Cvg PctEmpWithFamilyCvg ///
	PctEmpWithSingleCvg PctEnrlldAtEstOfferingHI PctEshiTakeup {
		replace `var' = `var'/100 if year==2019 & !inlist(`var', -1, -3)
	}

	/***************************************************************************
		(2.2.4) Make average aggregate premiums and employee contributions vars,
			weighting using percentage of employees with each coverage type
	***************************************************************************/

/* In years before we have employee + 1 coverage in tables, single and family 
	coverage are mutually exclusive and collectively exhaustive */
* Conditions for "share is coded as no data (-1) or suppressed (-3)"
local nosingle inlist(PctEmpWithSingleCvg, -1, -3)
local nofamily inlist(PctEmpWithFamilyCvg, -1, -3)
local noplus1  inlist(PctEmpWithEPlus1Cvg, -1, -3)

* Where employee + 1 coverage is not reported, single and family shares
* must sum to one
assert PctEmpWithSingleCvg + PctEmpWithFamilyCvg == 1 if `noplus1'

* A missing single or family share implies the matching averages are
* missing as well
assert inlist(AvgSinglePremPerEmp, -1, -3) & inlist(AvgEmpContribPerEmpSingleCvg, -1, -3) ///
	if `nosingle'

assert inlist(AvgFamilyPremPerEmp, -1, -3) & inlist(AvgEmpContribPerEmpFamilyCvg, -1, -3) ///
	if `nofamily'

* Average premium: -1 when the single or family share is missing; otherwise
* the share-weighted sum, with the employee + 1 term added only when that
* share is available
gen AvgPremPerEmp = -1 if `nosingle' | `nofamily'

replace AvgPremPerEmp = (AvgSinglePremPerEmp * PctEmpWithSingleCvg) ///
	+ (AvgFamilyPremPerEmp * PctEmpWithFamilyCvg) ///
	if !(`nosingle' | `nofamily') & `noplus1'

replace AvgPremPerEmp = (AvgSinglePremPerEmp * PctEmpWithSingleCvg) ///
	+ (AvgFamilyPremPerEmp * PctEmpWithFamilyCvg) ///
	+ (AvgEPlus1PremPerEmp * PctEmpWithEPlus1Cvg) ///
	if !(`nosingle' | `nofamily') & !`noplus1'

* Average employee contribution: same construction
gen AvgEmpContribPerEmp = -1 if `nosingle' | `nofamily'

replace AvgEmpContribPerEmp = (AvgEmpContribPerEmpSingleCvg * PctEmpWithSingleCvg) ///
	+ (AvgEmpContribPerEmpFamilyCvg * PctEmpWithFamilyCvg) ///
	if !(`nosingle' | `nofamily') & `noplus1'

replace AvgEmpContribPerEmp = (AvgEmpContribPerEmpSingleCvg * PctEmpWithSingleCvg) ///
	+ (AvgEmpContribPerEmpFamilyCvg * PctEmpWithFamilyCvg) ///
	+ (AvgEmpContribPerEmpEPlus1Cvg * PctEmpWithEPlus1Cvg) ///
	if !(`nosingle' | `nofamily') & !`noplus1'

* Wherever both shares are present the aggregates must be positive
assert AvgPremPerEmp > 0 & AvgEmpContribPerEmp > 0 ///
	if !(`nosingle' | `nofamily')

* Value labels for the two missing-data codes
label define missingmeps -3 "Suppressed" -1 "No data"
label values Avg* Pct* missingmeps

/*******************************************************************************
	(2.3) Adjust averages for inflation
*******************************************************************************/

* Build a year-level PCEPI adjustment factor in a tempfile, then merge it
* onto the MEPS data held under preserve
preserve

* NOTE(review): the FRED API key is committed in this file; consider setting
* it outside the do-file (e.g. once with -set fredkey ..., permanently-).
set fredkey 1d6fd8a4c9369fd163523f9dcc33a35b
import fred PCEPI, vintage(2020-09-11) clear
rename PCEPI_20200911 PCEPI 

gen year = real(substr(datestr, 1, 4))

* Annual index = mean of the observations within each year
collapse (mean) PCEPI, by(year)

* Look up the 2019 base value by year instead of by row position, and use the
* full variable name so the code does not depend on variable abbreviation
summarize PCEPI if year == 2019, meanonly
assert r(N) == 1

gen pceadjfactor2019 = r(mean) / PCEPI

tempfile pce
save `pce'

restore

merge 1:1 year using `pce', assert(2 3) keep(3) keepusing(pceadjfactor2019) nogen

* Rescale every average, leaving the -1/-3 missing codes untouched
foreach expenditure of varlist Avg* {
	replace `expenditure' = `expenditure' * pceadjfactor2019 if !inlist(`expenditure', -1, -3)
}
drop pceadjfactor2019

* Variable labels for the inflation-adjusted averages

* Aggregates across coverage types
label variable AvgPremPerEmp       "Avg premium per enrolled emp. at estblshmts offering HI (2019 USD)"
label variable AvgEmpContribPerEmp "Avg emp. contribution per enrolled emp. (2019 USD)"

* Total premiums by coverage type
label variable AvgSinglePremPerEmp "Avg single premium per enrolled emp. at estblshmts offering HI (2019 USD)"
label variable AvgFamilyPremPerEmp "Avg family premium per enrolled emp. at estblshmts offering HI (2019 USD)"
label variable AvgEPlus1PremPerEmp "Avg emp.+1 premium per enrolled emp. at estblshmts offering HI (2019 USD)"

* Employee contributions by coverage type
label variable AvgEmpContribPerEmpSingleCvg "Avg emp. contribution per enrolled emp. for single coverage (2019 USD)"
label variable AvgEmpContribPerEmpFamilyCvg "Avg emp. contribution per enrolled emp. for family coverage (2019 USD)"
label variable AvgEmpContribPerEmpEPlus1Cvg "Avg emp. contribution per enrolled emp. for emp.+1 coverage (2019 USD)"

/*******************************************************************************
	(2.4) Put variables in order, sort, and save
*******************************************************************************/

* Variables to place first; any variable not listed keeps its relative
* position after them
local leadvars year NumEmployees PctEmpAtEstOfferingHI PctEnrlldAtEstOfferingHI ///
	PctEshiTakeup AvgPremPerEmp AvgEmpContribPerEmp ///
	AvgSinglePremPerEmp AvgEmpContribPerEmpSingleCvg ///
	AvgFamilyPremPerEmp AvgEmpContribPerEmpFamilyCvg ///
	AvgEPlus1PremPerEmp AvgEmpContribPerEmpEPlus1Cvg

order `leadvars'
sort year

* Overwrites any existing copy of the national averages file
save "$repfolder/meps_national_insurance_averages.dta", replace


/*******************************************************************************
	(3) Final cleaning and merge
	
		Note: this is sourced from "1_build.do" by PC in pat workflow. It does a
				little extra cleaning and merges the MEPS data, nmces_1977, 
				nmes_1987, and ecec_hc_costs.xlsx
*******************************************************************************/

	/***************************************************************************
		(3.1) This imports the FRED series PCEPI and creates a tempfile to
			adjust for inflation. Same code as the top of section 2.3.
	***************************************************************************/

        * NOTE(review): the FRED API key is committed in this file; consider
        * setting it outside the do-file (e.g. once with
        * -set fredkey ..., permanently-).
        set fredkey 1d6fd8a4c9369fd163523f9dcc33a35b
        import fred PCEPI, vintage(2020-09-11) clear
        rename PCEPI_20200911 PCEPI 

        gen year = real(substr(datestr, 1, 4))

        * Annual index = mean of the observations within each year
        collapse (mean) PCEPI, by(year)

        * Look up the 2019 base value by year instead of by row position, and
        * use the full variable name so the code does not depend on variable
        * abbreviation
        summarize PCEPI if year == 2019, meanonly
        assert r(N) == 1

        gen pceadjfactor2019 = r(mean) / PCEPI

        * Kept as a tempfile; merged by year in the sections that follow
        tempfile pce
        save `pce'
		
	/***************************************************************************
		(3.2) This cleans the MEPS national insurance dta file. Renames
				variables and calculates worker/employer contributions/shares.
	***************************************************************************/
		
		* Load the national averages and derive enrollment and spending totals
		use "$repfolder/meps_national_insurance_averages.dta", clear
		rename AvgPremPerEmp meps_per_enrollee

		* Enrollment = employees x share at offering establishments x share
		* enrolled, scaled down by 1,000,000
		gen meps_enrollment = NumEmployees * PctEmpAtEstOfferingHI * PctEnrlldAtEstOfferingHI / 1000000
		gen meps_expend = meps_enrollment * meps_per_enrollee / 1000

		* Split total spending into the firm-paid and worker-paid parts
		gen meps_total_private = meps_expend
		gen meps_firm_private = meps_enrollment * (meps_per_enrollee - AvgEmpContribPerEmp) / 1000
		gen meps_worker_private = meps_enrollment * (AvgEmpContribPerEmp) / 1000

		* Premium variables: prem_<type>_total and prem_<type>_worker
		rename (AvgEPlus1PremPerEmp AvgEmpContribPerEmpEPlus1Cvg) ///
			(prem_plus_one_total prem_plus_one_worker)
		rename (AvgFamilyPremPerEmp AvgEmpContribPerEmpFamilyCvg) ///
			(prem_family_total prem_family_worker)
		rename (AvgSinglePremPerEmp AvgEmpContribPerEmpSingleCvg) ///
			(prem_single_total prem_single_worker)

		* Take-up and coverage shares
		rename PctEshiTakeup eshitakeup
		rename (PctEmpWithSingleCvg PctEmpWithFamilyCvg PctEmpWithEPlus1Cvg) ///
			(share_in_single share_in_family share_in_plus_one)

		* Employee + 1 values are blanked for years before 2001
		replace share_in_plus_one = . if year < 2001

		* Firm-paid premium, worker share of premium (x100), and coverage
		* shares rescaled x100
		foreach prem_type in plus_one family single {
			gen prem_`prem_type'_firm = prem_`prem_type'_total - prem_`prem_type'_worker
			gen prem_`prem_type'_workshare = prem_`prem_type'_worker / prem_`prem_type'_total * 100

			replace share_in_`prem_type' = share_in_`prem_type' * 100
		}

		* Blank every employee + 1 premium series before 2001 as well
		foreach outcome in total worker firm workshare {
			replace prem_plus_one_`outcome' = . if year < 2001
		}

		tempfile cleaned_meps
		save `cleaned_meps'

		
	/***************************************************************************
		(3.3) This cleans the NMCES 1977 file. Keeps employer-sponsored health 
		insurance coverage, group coverage and covered through employment.
		Calculates total premiums and employer contribution adjusted to
		2019 dollars.
	***************************************************************************/
		
		/* This opens the raw file for the nmces_1977 data. it was manually 
		downloaded. Further documentation on the source and download date can be
		found in FZZ\github tasks\[FZZ-11] Investigate ESHI Premiums\code\raw. */
		
		* Read the fixed-width NMCES 1977 extract, one field per line
		infix ///
			str pid 1-8 ///
			str prminpid 18-24 ///
			double emprelp 26-26 ///
			double prseprg 157-164 ///
			double groupp 27-27 ///
			double groups 100-100 ///
			double wtinsp 238-242 ///
			double priminsd 98-98 ///
			double emprels 99-99 ///
			double prstotg 141-148 ///
			using "$repfolder/NMCES_1977/data.txt", clear

		* Sample: employer-sponsored coverage (emprelp == 2), group coverage
		* (groupp == 2 & groups == 2), covered through employment
		* (priminsd == 1 & emprels == 2)
		keep if emprelp == 2 & groupp == 2 & groups == 2 & priminsd == 1 & emprels == 2

		* Attach the 1977 inflation adjustment factor
		gen year = 1977
		merge m:1 year using `pce', assert(2 3) keep(3) keepusing(pceadjfactor2019) nogen

		* Total premium in 2019 $, weighted mean stored as a constant
		replace prstotg = prstotg * pceadjfactor2019
		sum prstotg [aw=wtinsp]
		gen avg_prem = r(mean)

		* Employer contribution in 2019 $, same treatment
		replace prseprg = prseprg * pceadjfactor2019
		sum prseprg [aw=wtinsp]
		gen avg_emp_prem = r(mean)

		* Collapse to a single row: year plus the two averages
		keep year avg_*
		duplicates drop

		tempfile nmces_1977_spend
		save `nmces_1977_spend'
		

		
	/***************************************************************************
		(3.4) This cleans the NMES 1987 file. Keeps employer-sponsored health 
		insurance coverage, group coverage and covered through employment.
		Calculates total premiums and employer contribution adjusted to
		2019 dollars.
	***************************************************************************/
		
		/* This opens the raw file for the NMES 1987 data. It was manually 
		downloaded. Further documentation on the source and download date can be
		found in FZZ\github tasks\[FZZ-11] Investigate ESHI Premiums\code\raw. */
		
        * Read the fixed-width NMES 1987 extract.
        * NOTE(review): "char" is not one of infix's documented storage types
        * (byte/int/long/float/double/str); confirm how cgroup is actually
        * read from columns 524-525 before relying on the cgroup filter.
        infix str rcordidx 1-15 str phldridx 24-31 double groupcov 51-52 ///
            double datasrce 46-46 double typex 50-50 double totpremx 88-95 ///
            double empcontx 106-113 double groupcox 57-57 double postjo2 528-539 ///
            char cgroup 524-525 using ///
			"$repfolder/NMES_1987/data.txt", clear

        ** Drop the federal government as an insurer **
        drop if datasrce == 3
        ** Drop retirees and survivors: we want employees **
        keep if typex == 1
        ** Keep if employed and insurance is private **
        keep if cgroup == 1
        ** Keep if E2 Org provides group coverage **
        keep if groupcov == 1

        ** Merge in PCEPI **
        gen year = 1987
        merge m:1 year using `pce', assert(2 3) keep(3) keepusing(pceadjfactor2019) nogen

        * Use the exact variable names read by infix (totpremx, empcontx) so
        * the loop does not depend on variable abbreviation being enabled.
        foreach outcome in totpremx empcontx {
            replace `outcome' = `outcome' * pceadjfactor2019
            * Sum within phldridx, then take the weighted mean of those sums
            egen `outcome'_person = sum(`outcome'), by(phldridx)
            sum `outcome'_person [aw=postjo2]

            if "`outcome'" == "totpremx" {
                gen avg_prem = r(mean)
            }
            else {
                gen avg_emp_prem = r(mean)
            }
        }

        * Collapse to a single row: year plus the two averages
        keep year avg_*
        duplicates drop
		
        tempfile nmes_1987_spend
        save `nmes_1987_spend'
		
		
		
	/***************************************************************************
		(3.5) This cleans ecec_hc_costs.xlsx, which was downloaded in January
				2020. Further documentation of manual download found in the
				source folder. 
	***************************************************************************/		
		
	*import excel "$localdir/github tasks/[FZZ-11] Investigate ESHI Premiums/code/raw/BLS/ecec_hc_costs.xlsx", ///
	*firstrow clear
	* Load the ECEC workbook; the first row supplies the variable names
	import excel "$repfolder/ecec_hc_costs.xlsx", firstrow clear

	* Hard-coded override of the 2017 ecec value
	replace ecec = 2.50 if year == 2017
	tsset year

	label variable eci "Employment Cost Index"
	*lab var ecec

	* Attach the PCEPI adjustment factor by year
	merge 1:1 year using `pce', assert(2 3) keep(3) keepusing(pceadjfactor2019) nogen

	* NOTE(review): ecec_growth uses the lead (F1.) while the two nhea series
	* use the lag (L1.) of pceadjfactor2019 - confirm this is intended.
	gen ecec_growth = (((ecec - F1.ecec) / F1.ecec) - (pceadjfactor2019 - F1.pceadjfactor2019)) * 100
	gen nhea_emp_growth = nhea_table - (pceadjfactor2019 - L1.pceadjfactor2019) * 100
	gen nhea_growth = nhea_total - (pceadjfactor2019 - L1.pceadjfactor2019) * 100

	tempfile ecec_growth
	save `ecec_growth'
		
		
**# Bookmark #1
/*******************************************************************************
	(3.6) This merges together the cleaned ecec_growth file (no access right now)
		and the cleaned meps file. Creates variables for average employer premium
		and average premium.
*******************************************************************************/		
		

    * Start from the ECEC series and attach the cleaned MEPS variables;
    * years present only in the ECEC file are kept
    use `ecec_growth'
    merge 1:1 year using `cleaned_meps', assert(1 3) keep(1 3) nogen
    tsset year

    * Average premium, employer-paid premium, and employer cost x take-up.
    * avg_empcost_takeup is computed before the 2007 replacement below, so it
    * does not reflect it.
    gen avg_prem = meps_per_enrollee
    gen avg_emp_prem = meps_per_enrollee - AvgEmpContribPerEmp
    gen avg_empcost_takeup = avg_emp_prem * eshitakeup

    rename (prem_single_total prem_family_total prem_plus_one_total) ///
        (avg_prem_single avg_prem_family avg_prem_plusone)

    * Set each 2007 value from the following year's value and nhea_emp_growth
    foreach series in avg_emp_prem avg_prem avg_prem_single avg_prem_family avg_prem_plusone {
        replace `series' = F.`series' * (1 + F.nhea_emp_growth / 100) if year == 2007
    }

    * Restrict to 1996-2019 and to the output variables
    keep if inrange(year, 1996, 2019)
    keep year avg_* eshitakeup

    tempfile meps_1996_2019_spend
    save `meps_1996_2019_spend'


/*******************************************************************************
	(3.7) This appends everything together and saves the result to disk
		(premium_by_year.dta and the premium_series Excel file)
*******************************************************************************/

    * Stack the MEPS years with the 1977 and 1987 single-row files
    use `meps_1996_2019_spend'
    qui append using "`nmces_1977_spend'" "`nmes_1987_spend'"
    sort year

    * Stata dataset, overwriting any existing copy
    save "$repfolder/premium_by_year", replace

    * Same series as a matrix, written to Excel starting at cell A2
    mkmat year eshitakeup avg_emp_prem avg_prem avg_empcost_takeup, ///
        matrix(premium_series)
    putexcel set "$repfolder/premium_series", replace
    putexcel A2 = matrix(premium_series)
		
		
		